setup libraries and working directory

# General 
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(skimr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# date
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
#track time
#devtools::install_github("jabiru/tictoc")
library(tictoc)

#install.packages("Hmisc")
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units

import csv file

loans <- read_csv("Loans.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   loan_amnt = col_integer(),
##   funded_amnt = col_integer(),
##   installment = col_double(),
##   annual_inc = col_double(),
##   dti = col_double(),
##   delinq_2yrs = col_integer(),
##   inq_last_6mths = col_integer(),
##   mths_since_last_delinq = col_integer(),
##   mths_since_last_record = col_integer(),
##   open_acc = col_integer(),
##   pub_rec = col_integer(),
##   revol_bal = col_integer(),
##   revol_util = col_double(),
##   total_acc = col_integer(),
##   policy_code = col_integer(),
##   loan_defaulted = col_integer()
## )
## See spec(...) for full column specifications.

Explore data

## cols(
##   loan_amnt = col_integer(),
##   funded_amnt = col_integer(),
##   term = col_character(),
##   int_rate = col_character(),
##   installment = col_double(),
##   grade = col_character(),
##   sub_grade = col_character(),
##   emp_title = col_character(),
##   emp_length = col_character(),
##   home_ownership = col_character(),
##   annual_inc = col_double(),
##   verification_status = col_character(),
##   pymnt_plan = col_character(),
##   desc = col_character(),
##   purpose = col_character(),
##   title = col_character(),
##   zip_code = col_character(),
##   addr_state = col_character(),
##   dti = col_double(),
##   delinq_2yrs = col_integer(),
##   earliest_cr_line = col_character(),
##   inq_last_6mths = col_integer(),
##   mths_since_last_delinq = col_integer(),
##   mths_since_last_record = col_integer(),
##   open_acc = col_integer(),
##   pub_rec = col_integer(),
##   revol_bal = col_integer(),
##   revol_util = col_double(),
##   total_acc = col_integer(),
##   initial_list_status = col_character(),
##   mths_since_last_major_derog = col_character(),
##   policy_code = col_integer(),
##   loan_defaulted = col_integer()
## )
## # A tibble: 6 x 33
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 26 more variables: emp_title <chr>, emp_length <chr>,
## #   home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## #   pymnt_plan <chr>, desc <chr>, purpose <chr>, title <chr>,
## #   zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <int>,
## #   earliest_cr_line <chr>, inq_last_6mths <int>,
## #   mths_since_last_delinq <int>, mths_since_last_record <int>,
## #   open_acc <int>, pub_rec <int>, revol_bal <int>, revol_util <dbl>,
## #   total_acc <int>, initial_list_status <chr>,
## #   mths_since_last_major_derog <chr>, policy_code <int>,
## #   loan_defaulted <int>
##    loan_amnt      funded_amnt        term             int_rate        
##  Min.   :  500   Min.   :  500   Length:10000       Length:10000      
##  1st Qu.: 5000   1st Qu.: 5000   Class :character   Class :character  
##  Median : 9600   Median : 9250   Mode  :character   Mode  :character  
##  Mean   :11036   Mean   :10766                                        
##  3rd Qu.:15000   3rd Qu.:15000                                        
##  Max.   :35000   Max.   :35000                                        
##                                                                       
##   installment         grade            sub_grade        
##  Min.   :  15.69   Length:10000       Length:10000      
##  1st Qu.: 163.47   Class :character   Class :character  
##  Median : 275.11   Mode  :character   Mode  :character  
##  Mean   : 321.43                                        
##  3rd Qu.: 427.04                                        
##  Max.   :1276.60                                        
##                                                         
##   emp_title          emp_length        home_ownership       annual_inc    
##  Length:10000       Length:10000       Length:10000       Min.   :  2000  
##  Class :character   Class :character   Class :character   1st Qu.: 40000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 58000  
##                                                           Mean   : 68203  
##                                                           3rd Qu.: 82000  
##                                                           Max.   :900000  
##                                                           NA's   :1       
##  verification_status  pymnt_plan            desc          
##  Length:10000        Length:10000       Length:10000      
##  Class :character    Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character  
##                                                           
##                                                           
##                                                           
##                                                           
##    purpose             title             zip_code        
##  Length:10000       Length:10000       Length:10000      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   addr_state             dti         delinq_2yrs      earliest_cr_line  
##  Length:10000       Min.   : 0.00   Min.   : 0.0000   Length:10000      
##  Class :character   1st Qu.: 8.16   1st Qu.: 0.0000   Class :character  
##  Mode  :character   Median :13.41   Median : 0.0000   Mode  :character  
##                     Mean   :13.34   Mean   : 0.1482                     
##                     3rd Qu.:18.69   3rd Qu.: 0.0000                     
##                     Max.   :29.99   Max.   :11.0000                     
##                                     NA's   :5                           
##  inq_last_6mths   mths_since_last_delinq mths_since_last_record
##  Min.   : 0.000   Min.   :  0.00         Min.   :  0.00        
##  1st Qu.: 0.000   1st Qu.: 18.00         1st Qu.:  0.00        
##  Median : 1.000   Median : 34.00         Median : 86.00        
##  Mean   : 1.067   Mean   : 35.89         Mean   : 61.65        
##  3rd Qu.: 2.000   3rd Qu.: 53.00         3rd Qu.:101.00        
##  Max.   :25.000   Max.   :120.00         Max.   :119.00        
##  NA's   :5        NA's   :6316           NA's   :9160          
##     open_acc         pub_rec          revol_bal         revol_util    
##  Min.   : 1.000   Min.   :0.00000   Min.   :      0   Min.   :  0.00  
##  1st Qu.: 6.000   1st Qu.:0.00000   1st Qu.:   3524   1st Qu.: 25.00  
##  Median : 9.000   Median :0.00000   Median :   8646   Median : 48.70  
##  Mean   : 9.335   Mean   :0.06013   Mean   :  14271   Mean   : 48.47  
##  3rd Qu.:12.000   3rd Qu.:0.00000   3rd Qu.:  16952   3rd Qu.: 71.90  
##  Max.   :39.000   Max.   :3.00000   Max.   :1207359   Max.   :108.80  
##  NA's   :5        NA's   :5                           NA's   :23      
##    total_acc     initial_list_status mths_since_last_major_derog
##  Min.   : 1.00   Length:10000        Length:10000               
##  1st Qu.:13.00   Class :character    Class :character           
##  Median :20.00   Mode  :character    Mode  :character           
##  Mean   :22.09                                                  
##  3rd Qu.:29.00                                                  
##  Max.   :90.00                                                  
##  NA's   :5                                                      
##   policy_code loan_defaulted  
##  Min.   :1    Min.   :0.0000  
##  1st Qu.:1    1st Qu.:0.0000  
##  Median :1    Median :0.0000  
##  Mean   :1    Mean   :0.1295  
##  3rd Qu.:1    3rd Qu.:0.0000  
##  Max.   :1    Max.   :1.0000  
## 

## # A tibble: 2 x 2
##   loan_defaulted count
##            <int> <int>
## 1              0  8705
## 2              1  1295
## # A tibble: 1 x 1
##    rate
##   <dbl>
## 1 0.130

Total NAs

sum(is.na(loans))/(nrow(loans)*ncol(loans)) 
## [1] 0.08894242

Missing character data

missing_character_values_tbl <- loans %>%
    select_if(is.character) %>%
    map_df(~ is.na(.) %>% sum()) %>%
    gather() %>%
    arrange(desc(value)) %>%
    mutate(key = as_factor(key))

missing_character_values_tbl
## # A tibble: 17 x 2
##    key                         value
##    <fct>                       <int>
##  1 mths_since_last_major_derog 10000
##  2 desc                         3229
##  3 emp_title                     589
##  4 earliest_cr_line                5
##  5 title                           3
##  6 term                            0
##  7 int_rate                        0
##  8 grade                           0
##  9 sub_grade                       0
## 10 emp_length                      0
## 11 home_ownership                  0
## 12 verification_status             0
## 13 pymnt_plan                      0
## 14 purpose                         0
## 15 zip_code                        0
## 16 addr_state                      0
## 17 initial_list_status             0

Missing numeric data

missing_numeric_values_tbl <- loans %>%
    select_if(is.numeric) %>%
    map_df(~ is.na(.) %>% sum()) %>%
    gather() %>%
    arrange(desc(value)) %>%
    mutate(key = as_factor(key))

missing_numeric_values_tbl
## # A tibble: 16 x 2
##    key                    value
##    <fct>                  <int>
##  1 mths_since_last_record  9160
##  2 mths_since_last_delinq  6316
##  3 revol_util                23
##  4 delinq_2yrs                5
##  5 inq_last_6mths             5
##  6 open_acc                   5
##  7 pub_rec                    5
##  8 total_acc                  5
##  9 annual_inc                 1
## 10 loan_amnt                  0
## 11 funded_amnt                0
## 12 installment                0
## 13 dti                        0
## 14 revol_bal                  0
## 15 policy_code                0
## 16 loan_defaulted             0

Missing annual income

loans%>%filter(is.na(annual_inc))
## # A tibble: 1 x 33
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
## 1      6700        6700 36 m… 7.75%           209. A     A3       
## # ... with 26 more variables: emp_title <chr>, emp_length <chr>,
## #   home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## #   pymnt_plan <chr>, desc <chr>, purpose <chr>, title <chr>,
## #   zip_code <chr>, addr_state <chr>, dti <dbl>, delinq_2yrs <int>,
## #   earliest_cr_line <chr>, inq_last_6mths <int>,
## #   mths_since_last_delinq <int>, mths_since_last_record <int>,
## #   open_acc <int>, pub_rec <int>, revol_bal <int>, revol_util <dbl>,
## #   total_acc <int>, initial_list_status <chr>,
## #   mths_since_last_major_derog <chr>, policy_code <int>,
## #   loan_defaulted <int>

desc vs. purpose deep dive

I have seen some cases where the identified puprose category does not match the desc. A proper deep dive into desc text in “Phase 2 For now will use purpose feature in modelling as I suspect the vast majority of cases would be accurate (80/20 approach)”

loans %>% select( purpose, title,desc)%>% unique()%>% arrange(purpose) %>%group_by(purpose)%>%top_n(10)
## Selecting by desc
## # A tibble: 140 x 3
## # Groups:   purpose [14]
##    purpose title                   desc                                   
##    <chr>   <chr>                   <chr>                                  
##  1 car     FICO score 762 want's … This loan would be for a 2006 PT Cruis…
##  2 car     purchase work pickup t… This loan would purchase a small picku…
##  3 car     personal                buying a car for my son                
##  4 car     Car Payment             To pay off car loan with high interest…
##  5 car     Toby's Truck            We are helping our son purchase a truc…
##  6 car     Truck Loan              This loan request is for a fantastic f…
##  7 car     vehicle loan            a loan to purchase a vehicle           
##  8 car     Loan                    Would like to try to purchase a new ca…
##  9 car     Debt Consolidation      Trying to consolidate all my loans. Th…
## 10 car     Purchase car for daugh… Trying to purchase 2001 Honda Accord. …
## # ... with 130 more rows

emp title deep dive (Phase 2)

loans %>% select(emp_title)%>% group_by(emp_title)%>% summarise(count=n())%>%arrange(desc(count))
## # A tibble: 8,185 x 2
##    emp_title               count
##    <chr>                   <int>
##  1 <NA>                      589
##  2 US Army                    37
##  3 Bank of America            23
##  4 IBM                        22
##  5 US Navy                    17
##  6 USAF                       17
##  7 United States Air Force    16
##  8 Wells Fargo                15
##  9 AT&T                       14
## 10 Self Employed              14
## # ... with 8,175 more rows

Unique character values

loans %>% select_if(is.character) %>% names()
##  [1] "term"                        "int_rate"                   
##  [3] "grade"                       "sub_grade"                  
##  [5] "emp_title"                   "emp_length"                 
##  [7] "home_ownership"              "verification_status"        
##  [9] "pymnt_plan"                  "desc"                       
## [11] "purpose"                     "title"                      
## [13] "zip_code"                    "addr_state"                 
## [15] "earliest_cr_line"            "initial_list_status"        
## [17] "mths_since_last_major_derog"
unique_character_values_tbl <- loans %>%
    select_if(is.character) %>%
    map_df(~ unique(.) %>% length()) %>%
    gather() %>%
    arrange(value) %>%
    mutate(key = as_factor(key))

unique_character_values_tbl
## # A tibble: 17 x 2
##    key                         value
##    <fct>                       <int>
##  1 initial_list_status             1
##  2 mths_since_last_major_derog     1
##  3 term                            2
##  4 pymnt_plan                      2
##  5 verification_status             3
##  6 home_ownership                  5
##  7 grade                           7
##  8 emp_length                     12
##  9 purpose                        14
## 10 sub_grade                      35
## 11 addr_state                     50
## 12 int_rate                      359
## 13 earliest_cr_line              464
## 14 zip_code                      720
## 15 title                        5678
## 16 desc                         6763
## 17 emp_title                    8185

Unique numeric values

loans %>% select_if(is.numeric) %>% names()
##  [1] "loan_amnt"              "funded_amnt"           
##  [3] "installment"            "annual_inc"            
##  [5] "dti"                    "delinq_2yrs"           
##  [7] "inq_last_6mths"         "mths_since_last_delinq"
##  [9] "mths_since_last_record" "open_acc"              
## [11] "pub_rec"                "revol_bal"             
## [13] "revol_util"             "total_acc"             
## [15] "policy_code"            "loan_defaulted"
unique_numeric_values_tbl <- loans %>%
    select_if(is.numeric) %>%
    map_df(~ unique(.) %>% length()) %>%
    gather() %>%
    arrange(value) %>%
    mutate(key = as_factor(key))

unique_numeric_values_tbl
## # A tibble: 16 x 2
##    key                    value
##    <fct>                  <int>
##  1 policy_code                1
##  2 loan_defaulted             2
##  3 pub_rec                    5
##  4 delinq_2yrs               11
##  5 inq_last_6mths            21
##  6 open_acc                  37
##  7 total_acc                 75
##  8 mths_since_last_delinq    92
##  9 mths_since_last_record    95
## 10 loan_amnt                573
## 11 funded_amnt              727
## 12 revol_util              1031
## 13 annual_inc              1902
## 14 dti                     2585
## 15 installment             6381
## 16 revol_bal               8130

drop features due too many missing, unary type, too many unique levels or correlated features.

Note: emp_title(“Phase 2”)

drop_features <- c("initial_list_status","mths_since_last_major_derog","mths_since_last_record","desc"
                   ,"title","policy_code","emp_title")

loans01 <- loans %>% select(-drop_features)

head(loans01)
## # A tibble: 6 x 26
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 19 more variables: emp_length <chr>, home_ownership <chr>,
## #   annual_inc <dbl>, verification_status <chr>, pymnt_plan <chr>,
## #   purpose <chr>, zip_code <chr>, addr_state <chr>, dti <dbl>,
## #   delinq_2yrs <int>, earliest_cr_line <chr>, inq_last_6mths <int>,
## #   mths_since_last_delinq <int>, open_acc <int>, pub_rec <int>,
## #   revol_bal <int>, revol_util <dbl>, total_acc <int>,
## #   loan_defaulted <int>

Impute and transform numeric features

Note: for analysis purposes mean value imputation is being completed pre-modelling, however, going forward best practice is to incorporate during modelling based on n-cross validation to ensure robust modelling outcomes and minimising and leakage and/or overtraining.

replace_with_0 <- c("mths_since_last_delinq","delinq_2yrs","inq_last_6mths","open_acc","pub_rec","total_acc")


# Replace missing values with 0
for (i in seq_along(replace_with_0)){
loans01[is.na(loans01[,replace_with_0[i]]), replace_with_0[i]] <- 0
}


#Replace missing with mean value

mean_revol_util<-loans01 %>% select_("revol_util") %>% summarise(mean(revol_util,na.rm = TRUE))
mean_annual_inc <- loans01 %>% select_("annual_inc") %>% summarise(mean(annual_inc,na.rm = TRUE))
loans01 %>% filter(revol_bal==0)
## # A tibble: 278 x 26
##    loan_amnt funded_amnt term  int_rate installment grade sub_grade
##        <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
##  1      8000        8000 36 m… 14.61%         276.  D     D1       
##  2      7800        7800 60 m… 10.00%         166.  B     B2       
##  3     16750       16750 36 m… 12.53%         561.  B     B5       
##  4      2750        2750 36 m… 16.95%          98.0 E     E5       
##  5      6450        6450 36 m… 5.99%          196.  A     A2       
##  6      7000        7000 36 m… 7.51%          218.  A     A3       
##  7     15000       15000 60 m… 21.28%         408.  F     F2       
##  8      5000        5000 36 m… 13.57%         170.  C     C3       
##  9      7000        7000 36 m… 6.99%          216.  A     A3       
## 10      1600        1600 36 m… 14.42%          55.0 D     D2       
## # ... with 268 more rows, and 19 more variables: emp_length <chr>,
## #   home_ownership <chr>, annual_inc <dbl>, verification_status <chr>,
## #   pymnt_plan <chr>, purpose <chr>, zip_code <chr>, addr_state <chr>,
## #   dti <dbl>, delinq_2yrs <dbl>, earliest_cr_line <chr>,
## #   inq_last_6mths <dbl>, mths_since_last_delinq <dbl>, open_acc <dbl>,
## #   pub_rec <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   loan_defaulted <int>
loans01[is.na(loans01[,"revol_util"]), "revol_util"] <- mean_revol_util
loans01[is.na(loans01[,"annual_inc"]), "annual_inc"] <- mean_annual_inc

#Log transform right skewed features
loans01$annual_inc_log <- log(loans01$annual_inc+1)
loans01$loan_amnt_log <- log(loans01$loan_amnt+1)
loans01$funded_amnt_log <- log(loans01$funded_amnt+1)
loans01$revol_bal_log <- log(loans01$revol_bal+1)
loans01$installment_log <- log(loans01$installment+1)



#New features based on business knowledge/interactions
loans01$funded_loan_amnt_ratio <- (loans01$funded_amnt/loans01$loan_amnt)^3
loans01$installment_inc_ratio <- 12*loans01$installment/loans01$annual_inc
loans01$open_total_acc_ratio <- ifelse(loans01$total_acc==0,0,loans01$open_acc/(loans01$total_acc))

Impute character features and change to factors

#categorise numeric features
loans01$pub_rec_flag <- as.factor(ifelse(loans01$pub_rec>0,1,loans01$pub_rec))

Convert character to date and numeric

#new features
#earliest_cr_line
loans01 %>% group_by(earliest_cr_line) %>% summarise(count=n()) %>% arrange(earliest_cr_line)
## # A tibble: 464 x 2
##    earliest_cr_line count
##    <chr>            <int>
##  1 1/01/1963            1
##  2 1/01/1968            2
##  3 1/01/1972            1
##  4 1/01/1973            3
##  5 1/01/1975            1
##  6 1/01/1976            2
##  7 1/01/1977            3
##  8 1/01/1978            3
##  9 1/01/1979            6
## 10 1/01/1980            3
## # ... with 454 more rows
loans01$earliest_cr_line_date <-  as.Date(loans01$earliest_cr_line,"%m/%d/%Y",na.action(omit))
loans01$earliest_cr_line_years <- as.numeric(time_length(interval(loans01$earliest_cr_line_date,as.Date("2018-01-01")),"years"))

#impute missing numeric for earliest_cr_years
mean_earliest_cr_line_years <- loans01 %>% select_("earliest_cr_line_years") %>% summarise(mean(earliest_cr_line_years,na.rm = TRUE))
loans01[is.na(loans01[,"earliest_cr_line_years"]), "earliest_cr_line_years"] <- mean_earliest_cr_line_years

loans01 %>% filter(is.na(earliest_cr_line))
## # A tibble: 5 x 37
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
## 1      3500        3500 36 m… 10.28%         113.  C     C1       
## 2      1000        1000 36 m… 7.12%           30.9 A     A1       
## 3      2000        2000 36 m… 9.96%           64.5 B     B5       
## 4      2600        2600 36 m… 8.38%           81.9 A     A5       
## 5      6700        6700 36 m… 7.75%          209.  A     A3       
## # ... with 30 more variables: emp_length <chr>, home_ownership <chr>,
## #   annual_inc <dbl>, verification_status <chr>, pymnt_plan <chr>,
## #   purpose <chr>, zip_code <chr>, addr_state <chr>, dti <dbl>,
## #   delinq_2yrs <dbl>, earliest_cr_line <chr>, inq_last_6mths <dbl>,
## #   mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## #   revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## #   funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## #   funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## #   open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## #   earliest_cr_line_date <date>, earliest_cr_line_years <dbl>
head(loans01)
## # A tibble: 6 x 37
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <chr> <chr>          <dbl> <chr> <chr>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 30 more variables: emp_length <chr>, home_ownership <chr>,
## #   annual_inc <dbl>, verification_status <chr>, pymnt_plan <chr>,
## #   purpose <chr>, zip_code <chr>, addr_state <chr>, dti <dbl>,
## #   delinq_2yrs <dbl>, earliest_cr_line <chr>, inq_last_6mths <dbl>,
## #   mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## #   revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## #   funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## #   funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## #   open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## #   earliest_cr_line_date <date>, earliest_cr_line_years <dbl>
loans01 %>% group_by(term) %>% summarise(count=n())
## # A tibble: 2 x 2
##   term      count
##   <chr>     <int>
## 1 36 months  7441
## 2 60 months  2559
loans01 %>% group_by(purpose) %>% summarise(count=n())
## # A tibble: 14 x 2
##    purpose            count
##    <chr>              <int>
##  1 car                  352
##  2 credit_card         1290
##  3 debt_consolidation  4556
##  4 educational          112
##  5 home_improvement     811
##  6 house                 97
##  7 major_purchase       551
##  8 medical              185
##  9 moving               162
## 10 other               1044
## 11 renewable_energy      19
## 12 small_business       473
## 13 vacation              96
## 14 wedding              252
#convert to integer/numeric
loans01 %>% group_by(int_rate) %>% summarise(count=n())
## # A tibble: 359 x 2
##    int_rate count
##    <chr>    <int>
##  1 10.00%      56
##  2 10.01%       4
##  3 10.08%      15
##  4 10.14%       1
##  5 10.20%       5
##  6 10.25%      58
##  7 10.28%       7
##  8 10.33%       1
##  9 10.36%      68
## 10 10.37%     117
## # ... with 349 more rows
loans01$int_rate_num <- as.numeric(gsub("%","",loans01$int_rate))

loans01 %>% group_by(emp_length) %>% summarise(count=n()) %>% arrange(desc(count))
## # A tibble: 12 x 2
##    emp_length count
##    <chr>      <int>
##  1 10+ years   2185
##  2 < 1 year    1183
##  3 2 years     1147
##  4 3 years     1017
##  5 4 years      899
##  6 1 year       872
##  7 5 years      788
##  8 6 years      539
##  9 7 years      424
## 10 8 years      353
## 11 9 years      334
## 12 n/a          259
loans01$emp_length_num <- as.numeric(gsub("\\D","",loans01$emp_length))
loans01 %>% filter(emp_length=="n/a") %>% select(emp_length,emp_length_num,annual_inc,loan_defaulted)
## # A tibble: 259 x 4
##    emp_length emp_length_num annual_inc loan_defaulted
##    <chr>               <dbl>      <dbl>          <int>
##  1 n/a                    NA      55200              0
##  2 n/a                    NA      63000              0
##  3 n/a                    NA      20000              1
##  4 n/a                    NA      12360              0
##  5 n/a                    NA      30000              0
##  6 n/a                    NA      30000              0
##  7 n/a                    NA      28000              0
##  8 n/a                    NA      61000              0
##  9 n/a                    NA      29736              0
## 10 n/a                    NA      66000              0
## # ... with 249 more rows
#impute missing numeric for emp_length
mean_emp_length <- loans01 %>% select_("emp_length_num") %>% summarise(mean(emp_length_num,na.rm = TRUE))
loans01[is.na(loans01[,"emp_length_num"]), "emp_length_num"] <- mean_emp_length





#target encode these during modelling on h2o - too many levels !!!!
loans01 %>% group_by(sub_grade) %>% summarise(count=n())
## # A tibble: 35 x 2
##    sub_grade count
##    <chr>     <int>
##  1 A1          250
##  2 A2          389
##  3 A3          407
##  4 A4          715
##  5 A5          688
##  6 B1          450
##  7 B2          490
##  8 B3          742
##  9 B4          606
## 10 B5          664
## # ... with 25 more rows
loans01 %>% group_by(addr_state) %>% summarise(count=n())
## # A tibble: 50 x 2
##    addr_state count
##    <chr>      <int>
##  1 AK            26
##  2 AL           119
##  3 AR            48
##  4 AZ           210
##  5 CA          1748
##  6 CO           175
##  7 CT           195
##  8 DC            54
##  9 DE            35
## 10 FL           714
## # ... with 40 more rows
loans01 %>% group_by(zip_code) %>% summarise(count=n())
## # A tibble: 720 x 2
##    zip_code count
##    <chr>    <int>
##  1 007xx        1
##  2 010xx       32
##  3 011xx        5
##  4 012xx        2
##  5 013xx        7
##  6 014xx       12
##  7 015xx       11
##  8 016xx        4
##  9 017xx       10
## 10 018xx       33
## # ... with 710 more rows

Verification_status clean up

loans01 %>% group_by(pymnt_plan) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 2 x 3
##   pymnt_plan count default
##   <chr>      <int>   <dbl>
## 1 n           9998   0.129
## 2 y              2   1
loans01 %>% group_by(verification_status) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 3 x 3
##   verification_status      count default
##   <chr>                    <int>   <dbl>
## 1 not verified              4367   0.110
## 2 VERIFIED - income         3214   0.151
## 3 VERIFIED - income source  2419   0.136
#clean up verification status
loans01$verification_status_income <- as.factor(ifelse(loans01$verification_status=="not verified",0,1))
loans01 %>% group_by(verification_status_income) %>% summarise(count=n(), default=mean(loan_defaulted))
## # A tibble: 2 x 3
##   verification_status_income count default
##   <fct>                      <int>   <dbl>
## 1 0                           4367   0.110
## 2 1                           5633   0.144

convert all character features to factor

loans01 <- loans01 %>% mutate_if(sapply(loans01,is.character),as.factor)
sapply(loans01,class)  
##                  loan_amnt                funded_amnt 
##                  "integer"                  "integer" 
##                       term                   int_rate 
##                   "factor"                   "factor" 
##                installment                      grade 
##                  "numeric"                   "factor" 
##                  sub_grade                 emp_length 
##                   "factor"                   "factor" 
##             home_ownership                 annual_inc 
##                   "factor"                  "numeric" 
##        verification_status                 pymnt_plan 
##                   "factor"                   "factor" 
##                    purpose                   zip_code 
##                   "factor"                   "factor" 
##                 addr_state                        dti 
##                   "factor"                  "numeric" 
##                delinq_2yrs           earliest_cr_line 
##                  "numeric"                   "factor" 
##             inq_last_6mths     mths_since_last_delinq 
##                  "numeric"                  "numeric" 
##                   open_acc                    pub_rec 
##                  "numeric"                  "numeric" 
##                  revol_bal                 revol_util 
##                  "integer"                  "numeric" 
##                  total_acc             loan_defaulted 
##                  "numeric"                  "integer" 
##             annual_inc_log              loan_amnt_log 
##                  "numeric"                  "numeric" 
##            funded_amnt_log              revol_bal_log 
##                  "numeric"                  "numeric" 
##            installment_log     funded_loan_amnt_ratio 
##                  "numeric"                  "numeric" 
##      installment_inc_ratio       open_total_acc_ratio 
##                  "numeric"                  "numeric" 
##               pub_rec_flag      earliest_cr_line_date 
##                   "factor"                     "Date" 
##     earliest_cr_line_years               int_rate_num 
##                  "numeric"                  "numeric" 
##             emp_length_num verification_status_income 
##                  "numeric"                   "factor"

bar graph showing count by category

head(loans01)
## # A tibble: 6 x 40
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <fct> <fct>          <dbl> <fct> <fct>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 33 more variables: emp_length <fct>, home_ownership <fct>,
## #   annual_inc <dbl>, verification_status <fct>, pymnt_plan <fct>,
## #   purpose <fct>, zip_code <fct>, addr_state <fct>, dti <dbl>,
## #   delinq_2yrs <dbl>, earliest_cr_line <fct>, inq_last_6mths <dbl>,
## #   mths_since_last_delinq <dbl>, open_acc <dbl>, pub_rec <dbl>,
## #   revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## #   funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## #   funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## #   open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## #   earliest_cr_line_date <date>, earliest_cr_line_years <dbl>,
## #   int_rate_num <dbl>, emp_length_num <dbl>,
## #   verification_status_income <fct>
char_var_names<- loans01 %>%
  select_if(is.factor) %>%
   names() %>%
  as.list()

char_var_names
## [[1]]
## [1] "term"
## 
## [[2]]
## [1] "int_rate"
## 
## [[3]]
## [1] "grade"
## 
## [[4]]
## [1] "sub_grade"
## 
## [[5]]
## [1] "emp_length"
## 
## [[6]]
## [1] "home_ownership"
## 
## [[7]]
## [1] "verification_status"
## 
## [[8]]
## [1] "pymnt_plan"
## 
## [[9]]
## [1] "purpose"
## 
## [[10]]
## [1] "zip_code"
## 
## [[11]]
## [1] "addr_state"
## 
## [[12]]
## [1] "earliest_cr_line"
## 
## [[13]]
## [1] "pub_rec_flag"
## 
## [[14]]
## [1] "verification_status_income"
for (i in seq(char_var_names)){
 g <- ggplot(loans01,aes_string(char_var_names[[i]])) + geom_bar(aes(fill=as.factor(loan_defaulted))) + coord_flip()
  print(g)
}

stacked bar graph showing proportion by category

for (i in seq(char_var_names)){
 g <- ggplot(loans01,aes_string(char_var_names[[i]])) + geom_bar(aes(fill=as.factor(loan_defaulted)),position="fill") +
  ylab("Proportion per category") + coord_flip()
  print(g)
}

density plots of numeric features

num_var_names<- loans01 %>%
  select_if(is.numeric) %>%
  select(-loan_defaulted) %>%
  names() %>%
  as.list()

num_var_names
## [[1]]
## [1] "loan_amnt"
## 
## [[2]]
## [1] "funded_amnt"
## 
## [[3]]
## [1] "installment"
## 
## [[4]]
## [1] "annual_inc"
## 
## [[5]]
## [1] "dti"
## 
## [[6]]
## [1] "delinq_2yrs"
## 
## [[7]]
## [1] "inq_last_6mths"
## 
## [[8]]
## [1] "mths_since_last_delinq"
## 
## [[9]]
## [1] "open_acc"
## 
## [[10]]
## [1] "pub_rec"
## 
## [[11]]
## [1] "revol_bal"
## 
## [[12]]
## [1] "revol_util"
## 
## [[13]]
## [1] "total_acc"
## 
## [[14]]
## [1] "annual_inc_log"
## 
## [[15]]
## [1] "loan_amnt_log"
## 
## [[16]]
## [1] "funded_amnt_log"
## 
## [[17]]
## [1] "revol_bal_log"
## 
## [[18]]
## [1] "installment_log"
## 
## [[19]]
## [1] "funded_loan_amnt_ratio"
## 
## [[20]]
## [1] "installment_inc_ratio"
## 
## [[21]]
## [1] "open_total_acc_ratio"
## 
## [[22]]
## [1] "earliest_cr_line_years"
## 
## [[23]]
## [1] "int_rate_num"
## 
## [[24]]
## [1] "emp_length_num"
for (i in seq(num_var_names)){
 g <- ggplot(loans01,aes_string(num_var_names[[i]])) + geom_density(aes(color=as.factor(loan_defaulted)))
  print(g)
}

#correlation plot

#correlation coeff. only
corr <- loans01 %>% 
  select_if(is.numeric) %>%
  #select(-loan_defaulted) %>%
  cor(use="complete.obs",method="pearson") %>%
  round(2) %>%
  as.matrix() %>%
  melt()

ggplot(data = corr, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + 
  geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 8, hjust = 1)) +
 coord_fixed()

#p-values for the correlation coeff.
#loans01 %>% 
  #select_if(is.numeric) %>%
  #as.matrix() %>%
#rcorr(type="pearson")
summary(loans01)
##    loan_amnt      funded_amnt           term         int_rate   
##  Min.   :  500   Min.   :  500   36 months:7441   10.99% : 240  
##  1st Qu.: 5000   1st Qu.: 5000   60 months:2559   11.49% : 207  
##  Median : 9600   Median : 9250                    13.49% : 196  
##  Mean   :11036   Mean   :10766                    7.88%  : 185  
##  3rd Qu.:15000   3rd Qu.:15000                    7.51%  : 180  
##  Max.   :35000   Max.   :35000                    7.49%  : 162  
##                                                   (Other):8830  
##   installment      grade      sub_grade        emp_length  
##  Min.   :  15.69   A:2449   B3     : 742   10+ years:2185  
##  1st Qu.: 163.47   B:2952   A4     : 715   < 1 year :1183  
##  Median : 275.11   C:2005   A5     : 688   2 years  :1147  
##  Mean   : 321.43   D:1369   B5     : 664   3 years  :1017  
##  3rd Qu.: 427.04   E: 769   B4     : 606   4 years  : 899  
##  Max.   :1276.60   F: 327   C1     : 516   1 year   : 872  
##                    G: 129   (Other):6069   (Other)  :2697  
##   home_ownership   annual_inc                   verification_status
##  MORTGAGE:4445   Min.   :  2000   not verified            :4367    
##  NONE    :   1   1st Qu.: 40000   VERIFIED - income       :3214    
##  OTHER   :  34   Median : 58000   VERIFIED - income source:2419    
##  OWN     : 775   Mean   : 68203                                    
##  RENT    :4745   3rd Qu.: 82000                                    
##                  Max.   :900000                                    
##                                                                    
##  pymnt_plan               purpose        zip_code      addr_state  
##  n:9998     debt_consolidation:4556   100xx  : 158   CA     :1748  
##  y:   2     credit_card       :1290   112xx  : 141   NY     : 958  
##             other             :1044   945xx  : 129   FL     : 714  
##             home_improvement  : 811   070xx  : 125   TX     : 700  
##             major_purchase    : 551   606xx  : 114   NJ     : 482  
##             small_business    : 473   900xx  : 107   VA     : 392  
##             (Other)           :1275   (Other):9226   (Other):5006  
##       dti         delinq_2yrs        earliest_cr_line inq_last_6mths  
##  Min.   : 0.00   Min.   : 0.0000   11/01/1998:  95    Min.   : 0.000  
##  1st Qu.: 8.16   1st Qu.: 0.0000   11/01/2000:  95    1st Qu.: 0.000  
##  Median :13.41   Median : 0.0000   12/01/1998:  91    Median : 1.000  
##  Mean   :13.34   Mean   : 0.1481   10/01/1998:  86    Mean   : 1.066  
##  3rd Qu.:18.69   3rd Qu.: 0.0000   10/01/1999:  84    3rd Qu.: 2.000  
##  Max.   :29.99   Max.   :11.0000   (Other)   :9544    Max.   :25.000  
##                                    NA's      :   5                    
##  mths_since_last_delinq    open_acc        pub_rec         revol_bal      
##  Min.   :  0.00         Min.   : 0.00   Min.   :0.0000   Min.   :      0  
##  1st Qu.:  0.00         1st Qu.: 6.00   1st Qu.:0.0000   1st Qu.:   3524  
##  Median :  0.00         Median : 9.00   Median :0.0000   Median :   8646  
##  Mean   : 13.22         Mean   : 9.33   Mean   :0.0601   Mean   :  14271  
##  3rd Qu.: 23.00         3rd Qu.:12.00   3rd Qu.:0.0000   3rd Qu.:  16952  
##  Max.   :120.00         Max.   :39.00   Max.   :3.0000   Max.   :1207359  
##                                                                           
##    revol_util       total_acc     loan_defaulted   annual_inc_log  
##  Min.   :  0.00   Min.   : 0.00   Min.   :0.0000   Min.   : 7.601  
##  1st Qu.: 25.00   1st Qu.:13.00   1st Qu.:0.0000   1st Qu.:10.597  
##  Median : 48.60   Median :20.00   Median :0.0000   Median :10.968  
##  Mean   : 48.47   Mean   :22.08   Mean   :0.1295   Mean   :10.962  
##  3rd Qu.: 71.80   3rd Qu.:29.00   3rd Qu.:0.0000   3rd Qu.:11.314  
##  Max.   :108.80   Max.   :90.00   Max.   :1.0000   Max.   :13.710  
##                                                                    
##  loan_amnt_log    funded_amnt_log  revol_bal_log    installment_log
##  Min.   : 6.217   Min.   : 6.217   Min.   : 0.000   Min.   :2.815  
##  1st Qu.: 8.517   1st Qu.: 8.517   1st Qu.: 8.168   1st Qu.:5.103  
##  Median : 9.170   Median : 9.132   Median : 9.065   Median :5.621  
##  Mean   : 9.063   Mean   : 9.045   Mean   : 8.665   Mean   :5.549  
##  3rd Qu.: 9.616   3rd Qu.: 9.616   3rd Qu.: 9.738   3rd Qu.:6.059  
##  Max.   :10.463   Max.   :10.463   Max.   :14.004   Max.   :7.153  
##                                                                    
##  funded_loan_amnt_ratio installment_inc_ratio open_total_acc_ratio
##  Min.   :0.002326       Min.   :0.001437      Min.   :0.0000      
##  1st Qu.:1.000000       1st Qu.:0.034832      1st Qu.:0.3333      
##  Median :1.000000       Median :0.057328      Median :0.4444      
##  Mean   :0.970474       Mean   :0.065452      Mean   :0.4738      
##  3rd Qu.:1.000000       3rd Qu.:0.088175      3rd Qu.:0.5882      
##  Max.   :1.000000       Max.   :0.435456      Max.   :1.0000      
##                                                                   
##  pub_rec_flag earliest_cr_line_date earliest_cr_line_years  int_rate_num  
##  0:9427       Min.   :1950-12-01    Min.   : 9.167         Min.   : 5.42  
##  1: 573       1st Qu.:1994-01-01    1st Qu.:16.252         1st Qu.: 9.45  
##               Median :1998-06-01    Median :19.586         Median :11.89  
##               Mean   :1997-02-24    Mean   :20.849         Mean   :12.12  
##               3rd Qu.:2001-10-01    3rd Qu.:24.000         3rd Qu.:14.65  
##               Max.   :2008-11-01    Max.   :67.085         Max.   :23.91  
##               NA's   :5                                                   
##  emp_length_num   verification_status_income
##  Min.   : 1.000   0:4367                    
##  1st Qu.: 2.000   1:5633                    
##  Median : 4.000                             
##  Mean   : 5.012                             
##  3rd Qu.: 9.000                             
##  Max.   :10.000                             
## 

drop more features based on further analysis and create a couple more!

loans01 <- loans01 %>% 
  select(-verification_status) %>%
  select(-pymnt_plan)  %>%
  select(-earliest_cr_line) %>%
  select(-earliest_cr_line_date) %>%
  select(-pub_rec)
  
  loans01 <- loans01 %>% select(-zip_code)

loans01$emp_length_na <- as.factor(ifelse(loans01$emp_length=="n/a",1,0))

head(loans01)
## # A tibble: 6 x 35
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <fct> <fct>          <dbl> <fct> <fct>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 28 more variables: emp_length <fct>, home_ownership <fct>,
## #   annual_inc <dbl>, purpose <fct>, addr_state <fct>, dti <dbl>,
## #   delinq_2yrs <dbl>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## #   open_acc <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   loan_defaulted <int>, annual_inc_log <dbl>, loan_amnt_log <dbl>,
## #   funded_amnt_log <dbl>, revol_bal_log <dbl>, installment_log <dbl>,
## #   funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## #   open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## #   earliest_cr_line_years <dbl>, int_rate_num <dbl>,
## #   emp_length_num <dbl>, verification_status_income <fct>,
## #   emp_length_na <fct>

Initialise & set up h2o cluster for modelling

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
#h2o.shutdown()
h2o.init(nthreads=-1, max_mem_size = '2g')
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         12 hours 52 minutes 
##     H2O cluster timezone:       Australia/Sydney 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.5 
##     H2O cluster version age:    26 days  
##     H2O cluster name:           H2O_started_from_R_praneesh_kvn157 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.92 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)
# Training data: Separate into x and y tibbles

x_train_tbl <- loans01 %>% select(-loan_defaulted)
y_train_tbl <- loans01 %>% select(loan_defaulted)

y_train_tbl <- y_train_tbl %>%
    mutate(loan_defaulted = loan_defaulted %>% as.character() %>% as.factor())

head(x_train_tbl)
## # A tibble: 6 x 34
##   loan_amnt funded_amnt term  int_rate installment grade sub_grade
##       <int>       <int> <fct> <fct>          <dbl> <fct> <fct>    
## 1      4000        4000 60 m… 7.29%           79.8 A     A4       
## 2     16000       16000 60 m… 18.25%         408.  F     F1       
## 3      8700        8700 36 m… 7.88%          272.  A     A5       
## 4     18000       18000 60 m… 11.49%         396.  B     B4       
## 5     16000       16000 36 m… 11.83%         530.  B     B3       
## 6      3000        3000 60 m… 15.58%          72.3 D     D3       
## # ... with 27 more variables: emp_length <fct>, home_ownership <fct>,
## #   annual_inc <dbl>, purpose <fct>, addr_state <fct>, dti <dbl>,
## #   delinq_2yrs <dbl>, inq_last_6mths <dbl>, mths_since_last_delinq <dbl>,
## #   open_acc <dbl>, revol_bal <int>, revol_util <dbl>, total_acc <dbl>,
## #   annual_inc_log <dbl>, loan_amnt_log <dbl>, funded_amnt_log <dbl>,
## #   revol_bal_log <dbl>, installment_log <dbl>,
## #   funded_loan_amnt_ratio <dbl>, installment_inc_ratio <dbl>,
## #   open_total_acc_ratio <dbl>, pub_rec_flag <fct>,
## #   earliest_cr_line_years <dbl>, int_rate_num <dbl>,
## #   emp_length_num <dbl>, verification_status_income <fct>,
## #   emp_length_na <fct>
head(y_train_tbl)
## # A tibble: 6 x 1
##   loan_defaulted
##   <fct>         
## 1 0             
## 2 0             
## 3 0             
## 4 0             
## 5 0             
## 6 0

create h2o data set

data_h2o <- as.h2o(bind_cols(y_train_tbl, x_train_tbl))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

split into training and holdout set for estimating target encoding mean

splits_h2o <- h2o.splitFrame(data_h2o, ratios = c(0.9), seed = 1234)

train_h2o <- splits_h2o[[1]]
te_holdout_h2o <- splits_h2o[[2]]

Creating target encoding map on high cardinality categoricals features - to minimise overfitting in training

te_cols<- c("sub_grade","addr_state")

te_map <- h2o.target_encode_create(te_holdout_h2o, x = as.list(te_cols), y = "loan_defaulted")

te_map$sub_grade
##   sub_grade numerator denominator
## 1        A1         0          19
## 2        A2         0          42
## 3        A3         2          37
## 4        A4         4          78
## 5        A5         7          69
## 6        B1         5          39
## 
## [35 rows x 3 columns]
te_map$addr_state
##   addr_state numerator denominator
## 1         AK         1           4
## 2         AL         3          10
## 3         AR         0           7
## 4         AZ         2          19
## 5         CA        24         173
## 6         CO         5          17
## 
## [43 rows x 3 columns]

Apply the target encoding to the training data set. Use the following param for the training data:

Holdout_type is none as there is a separate te_holdout set.

ext_train_h2o <- h2o.target_encode_apply(train_h2o, x = as.list(te_cols), y = "loan_defaulted",
                                     target_encode_map = te_map, 
                                     holdout_type = "None",
                                      noise_level = 0,
                                     seed = 1234
                                     )

head(ext_train_h2o[c("sub_grade", "TargetEncode_sub_grade")])
##   sub_grade TargetEncode_sub_grade
## 1        A3             0.06406758
## 2        A3             0.06406758
## 3        A4             0.05148609
## 4        A5             0.10157812
## 5        A5             0.10157812
## 6        B2             0.07239862
head(ext_train_h2o[c("addr_state", "TargetEncode_addr_state")])
##   addr_state TargetEncode_addr_state
## 1         AK               0.1409067
## 2         AK               0.1409067
## 3         AK               0.1409067
## 4         AK               0.1409067
## 5         AK               0.1409067
## 6         AK               0.1409067
str(ext_train_h2o)
## Class 'H2OFrame' <environment: 0x7f997cf53428> 
##  - attr(*, "op")= chr "colnames="
##  - attr(*, "eval")= logi TRUE
##  - attr(*, "id")= chr "RTMP_sid_80a3_39"
##  - attr(*, "nrow")= int 8999
##  - attr(*, "ncol")= int 37
##  - attr(*, "types")=List of 37
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "int"
##   ..$ : chr "int"
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "int"
##   ..$ : chr "int"
##   ..$ : chr "int"
##   ..$ : chr "int"
##   ..$ : chr "int"
##   ..$ : chr "real"
##   ..$ : chr "int"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##   ..$ : chr "enum"
##   ..$ : chr "real"
##   ..$ : chr "real"
##  - attr(*, "data")='data.frame': 10 obs. of  37 variables:
##   ..$ addr_state                : Factor w/ 50 levels "AK","AL","AR",..: 1 1 1 1 1 1 1 1 1 1
##   ..$ sub_grade                 : Factor w/ 35 levels "A1","A2","A3",..: 3 3 4 5 5 7 8 9 10 11
##   ..$ loan_defaulted            : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1
##   ..$ loan_amnt                 : num  8400 12400 3000 4000 15000 6200 8000 1000 23000 5000
##   ..$ funded_amnt               : num  8400 12400 3000 4000 15000 6200 8000 1000 23000 5000
##   ..$ term                      : Factor w/ 2 levels "36 months","60 months": 1 1 1 1 1 1 2 1 2 1
##   ..$ int_rate                  : Factor w/ 359 levels "10.00%","10.01%",..: 314 321 327 338 326 17 349 51 72 78
##   ..$ installment               : num  259.9 385.8 93.9 127 469.2 ...
##   ..$ grade                     : Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 2 2 2 2 3
##   ..$ emp_length                : Factor w/ 12 levels "1 year","10+ years",..: 5 2 5 3 2 2 5 2 2 11
##   ..$ home_ownership            : Factor w/ 5 levels "MORTGAGE","NONE",..: 1 1 5 1 1 1 5 1 1 5
##   ..$ annual_inc                : num  120000 36800 45000 112000 78000 24000 55200 87000 80000 36000
##   ..$ purpose                   : Factor w/ 14 levels "car","credit_card",..: 7 3 3 5 3 2 3 10 3 10
##   ..$ dti                       : num  4.1 5.84 19.39 3.19 11.09 ...
##   ..$ delinq_2yrs               : num  0 0 0 0 0 0 0 1 0 0
##   ..$ inq_last_6mths            : num  0 0 0 0 0 1 1 0 1 3
##   ..$ mths_since_last_delinq    : num  0 0 0 72 0 0 0 13 0 0
##   ..$ open_acc                  : num  18 5 10 5 7 10 6 5 14 9
##   ..$ revol_bal                 : num  8761 6975 16487 554 40665 ...
##   ..$ revol_util                : num  8.7 46.8 63.2 39.6 61.3 55.2 15.2 55.2 60.3 82
##   ..$ total_acc                 : num  48 15 15 12 16 11 21 18 54 12
##   ..$ annual_inc_log            : num  11.7 10.5 10.7 11.6 11.3 ...
##   ..$ loan_amnt_log             : num  9.04 9.43 8.01 8.29 9.62 ...
##   ..$ funded_amnt_log           : num  9.04 9.43 8.01 8.29 9.62 ...
##   ..$ revol_bal_log             : num  9.08 8.85 9.71 6.32 10.61 ...
##   ..$ installment_log           : num  5.56 5.96 4.55 4.85 6.15 ...
##   ..$ funded_loan_amnt_ratio    : num  1 1 1 1 1 1 1 1 1 1
##   ..$ installment_inc_ratio     : num  0.026 0.1258 0.025 0.0136 0.0722 ...
##   ..$ open_total_acc_ratio      : num  0.375 0.333 0.667 0.417 0.438 ...
##   ..$ pub_rec_flag              : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1
##   ..$ earliest_cr_line_years    : num  29.7 13.9 16.6 15.3 24.3 ...
##   ..$ int_rate_num              : num  7.14 7.51 7.9 8.9 7.88 ...
##   ..$ emp_length_num            : num  4 10 4 2 10 10 4 10 10 1
##   ..$ verification_status_income: Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 1 2 1
##   ..$ emp_length_na             : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1
##   ..$ TargetEncode_sub_grade    : num  0.0641 0.0641 0.0515 0.1016 0.1016 ...
##   ..$ TargetEncode_addr_state   : num  0.141 0.141 0.141 0.141 0.141 ...

Train model with holdout target encoding. Replace Categorical with TargetEncoded variables.

y <- "loan_defaulted"
x <- setdiff(names(train_h2o), c(y, te_cols))

tic("Model_Training")

automl_models_h2o <- h2o.automl(
    x = x,
    y = y,
    training_frame    = train_h2o,
    nfolds = 5,
    #max_models = 10, #stop when either max of # models built or time runs out.
    max_runtime_secs  = 90
        # Early Stopping
    #stopping_rounds = 5, stopping_metric = "AUC", 
    #stopping_tolerance = 0.001, seed = 1234
  
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |===========                                                      |  18%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=============                                                    |  21%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |=================================================================| 100%
toc()
## Model_Training: 108.369 sec elapsed
automl_leader <- automl_models_h2o@leader